Data Mining Project

  • Alex Panchot (M20190546)
  • Hugo Mentzingen (M20190215)
  • Rennan Valadares (M20190146)
In [1]:
#remove warnings
import warnings
warnings.filterwarnings("ignore")

#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pydotplus
from sklearn import metrics
from sklearn.metrics.regression import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from copy import deepcopy
from sklearn.svm import LinearSVR
from sklearn import preprocessing
from scipy.spatial.distance import euclidean
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import k_means
from sklearn.externals.six import StringIO  
from IPython.display import Image  
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\six.py:31: DeprecationWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
  "(https://pypi.org/project/six/).", DeprecationWarning)
In [2]:
#Global variables

env_params = {
    "InsConColumns" : ['Premiums in LOB: Motor',
                       'Premiums in LOB: Household',
                       'Premiums in LOB: Health',
                       'Premiums in LOB:  Life',
                       'Premiums in LOB: Work Compensations',
                       'Premium: Sum'],
    "ValEngColumns" : ['Educational Degree',
                       'Geographic Living Area',
                       'Has Children (Y=1)',
                       'Gross Monthly Salary',
                       'Customer Monetary Value',
                       'First Policy´s Age'],
    "CategoricalColumns" : ['Educational Degree',
                            'Geographic Living Area',
                            'Has Children (Y=1)'],
    "NumericalColumns" : ['First Policy´s Year',
                          'Gross Monthly Salary',
                          'Customer Monetary Value',
                          'Claims Rate',
                          'Premiums in LOB: Motor',
                          'Premiums in LOB: Household',
                          'Premiums in LOB: Health',
                          'Premiums in LOB:  Life',
                          'Premiums in LOB: Work Compensations',
                          'Age',
                          'First Policy´s Age',
                          'Premium: Sum']
}
In [3]:
#Function to split the DataFrame in data complete (without NaNs on the same row) and incomplete
#The rows that belong to the 'incomplete' dataframe have at least one NaN

def split(data_insurance, reset_index = False):
    data_insurance_complete = pd.DataFrame()
    data_insurance_incomplete = data_insurance[data_insurance.isna().any(axis=1)]
    if reset_index:
        data_insurance_incomplete.reset_index(inplace=True)
        data_insurance_incomplete.drop('index', axis=1, inplace=True)
    data_insurance_complete = data_insurance[~data_insurance.isna().any(axis=1)]
    return data_insurance_complete, data_insurance_incomplete
In [4]:
#Function to plot the histogram of a variable (feature)

def histogram(df, features):

    fig = plt.figure(figsize=(12, 20))
    
    for i, feature in enumerate(features):
    
        # Set up the plot
        ax = fig.add_subplot((len(features)//3)+1, 3, i + 1)
        
        # Draw the plot
        ax.hist(df[feature], bins = 50,
                 color = 'blue', edgecolor = 'black')

        # Title and labels
        ax.set_title('Histogram of ' + feature)
        ax.set_xlabel(feature)
        ax.set_ylabel('Customers')

    plt.subplots_adjust(hspace = 0.2)
    plt.show()
In [5]:
#Function to plot the correlation between variables

def plotCorrelation(df):
    sns.set()
    fig, ax = plt.subplots(figsize=(9,6))
    plt.rcParams.update({'font.size': 11})
    sns.heatmap(df.corr(method='pearson'), annot=True, fmt='.2f', cmap='RdBu', vmin=-1, vmax=1, linewidths=.9, ax = ax).set_title('Variables correlation')
    ax.set_xticks(np.arange(df.corr(method='pearson').shape[1]+1), minor=True)
    ax.set_yticks(np.arange(df.corr(method='pearson').shape[0]+1), minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
    ax.tick_params(which="minor", bottom=True, left=True)
In [6]:
#Function to separate the dataframe into 'Insurance Consumption' and 'Value & Engagement' features

def separateVariables(df):
    InsCon = df[env_params['InsConColumns']]
    ValEng = df[env_params['ValEngColumns']]

    return InsCon, ValEng
In [7]:
#Function to perform outliers removal based on interquartile range (IQR) 

def removeOutliers(df, cutoff1, cutoff2, special_features):
    
    outliers_to_remove = []
    for feature in env_params['NumericalColumns']:
        if feature in df:
            #calculate interquartile range
            q25, q75 = np.nanpercentile(df[feature], 25), np.nanpercentile(df[feature], 75)
            iqr = q75 - q25
            print(feature)
            print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
        
            #calculate the outlier cutoff
            if feature not in special_features:
                cut_off = iqr * cutoff1
                lower, upper = q25 - cut_off, q75 + cut_off
            else:
                cut_off = iqr * cutoff2
                lower, upper = q25 - cut_off, q75 + cut_off
        
            #identify outliers
            outliers = [index for index, value in df[feature].iteritems() if value < lower or value > upper]
            print('Identified outliers: %d' % len(outliers))
            outliers_to_remove += outliers
    outliers_to_remove = set(outliers_to_remove)
    outliers_to_remove = sorted(outliers_to_remove)
    return outliers_to_remove
In [8]:
#Function to rescale the dataframe columns

def rescale(data_insurance):
    
    data_insurance_ = deepcopy(data_insurance)

    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(data_insurance_.drop(columns=env_params['CategoricalColumns'], axis = 1))
    scaled_features = pd.DataFrame(scaled_features, columns = data_insurance_.drop(columns=env_params['CategoricalColumns']).columns)
    scaled_data_insurance = pd.merge(scaled_features, data_insurance_[env_params['CategoricalColumns']], left_index=True, right_index=True)

    return scaled_data_insurance
In [9]:
#Function to evaluate the best n_neighbors to use with KNN

def evaluateClassifier(data_insurance):
    data_insurance_complete, data_insurance_incomplete = split(data_insurance, reset_index=True)

    def createAndFitClassifier(k):
        clf = KNeighborsClassifier(n_neighbors=k)    
        incomplete = deepcopy(data_insurance_incomplete)
        complete = deepcopy(data_insurance_complete)   
        X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != value].values,
                                                            complete.loc[:,value].values, test_size = 0.2, random_state = 0)
        trained_model = clf.fit(X_train, y_train)
        result = [clf, y_test, X_test, trained_model, incomplete, complete]
        return result
    
    accuracies_for_value_dict = {}

    for index, value in enumerate(env_params['CategoricalColumns']):

        accuracy_dict = {}

        for k in range(3,100):

            result = createAndFitClassifier(k)
            clf = result[0]
            y_test = result[1]
            X_test = result[2]                                                
            
            #calculate the model accuracy and storing the value into a dictionary
            y_pred = clf.predict(X_test)
            accuracy_matrix = confusion_matrix(y_test, y_pred)
            accuracy = accuracy_matrix.trace()/accuracy_matrix.sum()
            accuracy_dict[k] = accuracy
        
        accuracies_for_value_dict[value] = accuracy_dict
    
    return accuracies_for_value_dict
In [10]:
#Function that uses KNN to classify the missing values on CATEGORICAL columns

def classifyCategoricalData(data_insurance, n_neighbors_dict):
    data_insurance_complete, data_insurance_incomplete = split(data_insurance, reset_index=True)
  
    #Creating a classifier to fill the categorical data: Educational Degree, Geographic Living Area and Has Children (Y=1)
      
    for index, value in enumerate(env_params['CategoricalColumns']):

        if value in n_neighbors_dict:
            clf = KNeighborsClassifier(n_neighbors=n_neighbors_dict[value])    
        
            incomplete = deepcopy(data_insurance_incomplete)
            complete = deepcopy(data_insurance_complete)
        
            X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != value].values,
                                                            complete.loc[:,value].values, test_size = 0.2, random_state = 0)
        
            trained_model = clf.fit(X_train, 
                                 y_train)
           
            #fill the numerical columns with the column mean
            incomplete.loc[:, ~incomplete.columns.isin(env_params['CategoricalColumns']) ] = incomplete.loc[:, 
                                ~incomplete.columns.isin(env_params['CategoricalColumns'])].apply(lambda column: column.fillna(column.mean()), axis=0)
        
            #Round Age and First Policy's Year
            #incomplete['Age'] = incomplete['Age'].apply(lambda x:round(x))
            incomplete['First Policy´s Age'] =  incomplete['First Policy´s Age'].apply(lambda x:round(x))
                
            #Categorical columns with the exception of the one we want to predict
            cat_without_the_column = deepcopy(env_params['CategoricalColumns'])
            cat_without_the_column.pop(index)
        
            #Fill the categorical columns with the exception of the one we want to predict with the mode
            incomplete.loc[:, incomplete.columns.isin(cat_without_the_column) ] = incomplete.loc[:, 
                        incomplete.columns.isin(cat_without_the_column)].apply(lambda column: column.fillna(int(column.mode())), axis=0)
              
            prediction = trained_model.predict(incomplete.loc[:,incomplete.columns != value])
            temp_df = pd.DataFrame(prediction.reshape(-1,1), columns = [value])
        
        
            #now we are filling data_insurance_incomplete 
            for ind in range(len(temp_df)):
                if np.isnan(data_insurance_incomplete[value][ind]):
                    data_insurance_incomplete[value][ind] = temp_df[value][ind]


        #and reconstructing the original dataframe
        dataset = pd.concat([data_insurance_complete, data_insurance_incomplete])
        dataset.set_index(dataset['Customer Identity'] - 1, inplace=True)
    
    return dataset
In [11]:
#funcion for checking which algorithm is the best for using on each column for NUMERICAL columns

def checking_choices(data_insurance, number_of_tests=10):
    data_insurance_complete, data_insurance_incomplete = split(data_insurance)

    choices = []
    better_for_each_column = []
    test_errors = {}
    
    #testing
    for i in range(number_of_tests):
        test, error_list = regressor_test(data_insurance)
        choices.append(test)
        test_errors.update({'Test: '+ str(i):error_list})
        
    #chosing the best algorithm for each column
    for i in range(len(data_insurance.columns)):
        l = []
        for j in range(len(choices)):
            l.append(choices[j][i])
        better_for_each_column.append(max(set(l), key = l.count))
    #return the better algorithm for each column, and the error list    
    return better_for_each_column, test_errors
In [12]:
#function for test which regressor is best for each numerical column
#Return a list of lists with the best algorithm for each test (choose the number of tests on checking_choices function )

def regressor_test(data_insurance):
#variables to hold the Mean Squared Errors for each model
    dt_errors = []
    linear_errors = []
    svr_errors = []    
    
    complete,incomplete = split(data_insurance)
    
    for i in complete.columns:
        i = complete.columns[0]
        X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != i].values,
                                                            complete.loc[:,i].values, test_size = 0.2, random_state = 0)
        
        regressor1 = DecisionTreeRegressor(min_samples_split=112,min_samples_leaf=9)
        regressor2= LinearRegression()
        regressor3=LinearSVR()
        
        
        DT_Model = regressor1.fit(X_train, 
                                 y_train)
        Linear_trained_model2 = regressor2.fit(X_train, 
                                 y_train)
        SVR_trained_model3 = regressor3.fit(X_train, 
                                 y_train)  
        
        incomplete_2 = deepcopy(incomplete)
        incomplete_2.loc[:, incomplete.columns != i] = incomplete_2.loc[:, 
                                incomplete.columns != i].apply(lambda row: row.fillna(row.mean()), axis=1)

        y_pred1 = regressor1.predict(X_test)
        y_pred2 = regressor2.predict(X_test)
        y_pred3 = regressor3.predict(X_test)
        
        
        dt_errors.append(regressor1.score(X_test, y_test))
        linear_errors.append(regressor2.score(X_test, y_test))
        svr_errors.append(regressor3.score(X_test, y_test))
        
        
    errors_dict = {}
    errors_dict.update({"KN-Errors": dt_errors})
    errors_dict.update( {"Linear-Errors": linear_errors})
    errors_dict.update( {"SVR-Errors": svr_errors})


    #ROOT MEAN SQUARED ERROR 
    R2 = []

    #Filling R Squared for each column
    for i in range(0, len(complete.columns)):
        l = []
        l.extend((dt_errors[i], linear_errors[i], svr_errors[i]))
        
        if min(l) == dt_errors[i]:
            R2.append("KNN")
        elif min(l) == linear_errors[i]:
            R2.append("Linear")
        elif min(l) == svr_errors[i]:
            R2.append("SVR")
    


    return R2, errors_dict
In [13]:
#function to apply the regressors

def apply_regressors(choices, data_insurance, numerical_columns):

    complete,incomplete = split(data_insurance)

    for i,v in enumerate(complete.columns):
        
        #Check if it is a numerical column
        if v in numerical_columns:
            
            #use the choosen algorithm 
            if choices[i] == 'KNN':
                regressor = KNeighborsRegressor(5, 
                                                weights ='distance', 
                                                metric = 'euclidean')
            elif choices[i] == 'SVR':
                regressor = LinearSVR()
                
            elif choices[i] == 'Linear':
                regressor = LinearRegression()
                
            #Split in train-test data    
            X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != v].values,
                                                                complete.loc[:,v].values, test_size = 0.2, random_state = 0)
            #Train the model
            trained_model = regressor.fit(X_train, 
                                     y_train)
            
            #Make predictions
            incomplete_2 = deepcopy(incomplete)
            incomplete_2.loc[:, incomplete.columns != v] = incomplete_2.loc[:, 
                                    incomplete.columns != v].apply(lambda row: row.fillna(row.mean()), axis=1)
            
            prediction = trained_model.predict(incomplete_2.loc[:,incomplete_2.columns != v])
            temp_df = pd.DataFrame(prediction.reshape(-1,1), columns = [v])
            
            #fill NaN's on data_arrivals_incomplete 
            for index in range(len(temp_df)):
                if np.isnan(incomplete.iloc[index,i]):
                    incomplete.iloc[index,i] = temp_df[v][index]



    #and filling the nan's on arrivals_df
    dataset = pd.concat([complete, incomplete])
    dataset.set_index(dataset['Customer Identity'] - 1, inplace=True)
    
    
    return dataset
In [14]:
#Given two features (columns) this function divides a dataframe into quartiles (number of customer on each quartile cell)

def quartileMatrix(df, feature1, feature2):
  quartile_id = ['q1','q2','q3','q4']
  df['f1_quartile'] = pd.qcut(df[feature1], 4, quartile_id)
  df['f2_quartile'] = pd.qcut(df[feature2], 4, quartile_id)
  quartiles_df = pd.DataFrame(index=quartile_id, columns=quartile_id)

  for i in quartile_id:
      for j in quartile_id:
          quartiles_df.ix[i,j] = len(df[(df["f1_quartile"]==i) & (df["f2_quartile"]==j)])
  
  return quartiles_df
 

2. Data Preparation

In [15]:
#_________________________Cleaning and Filling the Data with the algorithms___________________________________________

#Read the dataset
insurance_df = pd.read_csv('https://drive.google.com/uc?export=download&id=1dCJHxjPPob8sEfyY6ddja0DeYpos5Uq7')

#Create Age column
insurance_df['Age'] = insurance_df.loc[:, 'Brithday Year'].apply(lambda x : 2018 - x )

#Create First Policy´s Age column
insurance_df['First Policy´s Age'] = insurance_df.loc[:, 'First Policy´s Year'].apply(lambda x : 2018 - x )

#Drop Birthday Year and First Policy´s Year
insurance_df.drop(['Brithday Year', 'First Policy´s Year' ], axis=1, inplace=True)

#Adding the sum of all premiums paid as a column 
insurance_df['Premium: Sum']=insurance_df[['Premiums in LOB: Work Compensations',
                                                         'Premiums in LOB:  Life',
                                                         'Premiums in LOB: Health',
                                                         'Premiums in LOB: Household',
                                                         'Premiums in LOB: Motor']].sum(axis=1)

2.1. NaNs and outliers

In [16]:
#Count the number of rows with NaNs
rows_with_nans = insurance_df.shape[0] - insurance_df.dropna().shape[0]
print(rows_with_nans)
print(rows_with_nans/insurance_df.shape[0])
309
0.030011655011655012
In [17]:
outliers = removeOutliers(insurance_df, 1.5, 1.5, [])
Gross Monthly Salary
Percentiles: 25th=1706.000, 75th=3290.250, IQR=1584.250
Identified outliers: 2
Customer Monetary Value
Percentiles: 25th=-9.440, 75th=399.778, IQR=409.218
Identified outliers: 110
Claims Rate
Percentiles: 25th=0.390, 75th=0.980, IQR=0.590
Identified outliers: 15
Premiums in LOB: Motor
Percentiles: 25th=190.590, 75th=408.300, IQR=217.710
Identified outliers: 6
Premiums in LOB: Household
Percentiles: 25th=49.450, 75th=290.050, IQR=240.600
Identified outliers: 633
Premiums in LOB: Health
Percentiles: 25th=111.800, 75th=219.820, IQR=108.020
Identified outliers: 24
Premiums in LOB:  Life
Percentiles: 25th=9.890, 75th=57.790, IQR=47.900
Identified outliers: 625
Premiums in LOB: Work Compensations
Percentiles: 25th=10.670, 75th=56.790, IQR=46.120
Identified outliers: 621
Age
Percentiles: 25th=35.000, 75th=65.000, IQR=30.000
Identified outliers: 0
First Policy´s Age
Percentiles: 25th=26.000, 75th=38.000, IQR=12.000
Identified outliers: 1
Premium: Sum
Percentiles: 25th=626.425, 75th=824.068, IQR=197.643
Identified outliers: 639
In [18]:
features = ['Gross Monthly Salary',
                          'Customer Monetary Value',
                          'Claims Rate',
                          'Premiums in LOB: Motor',
                          'Premiums in LOB: Household',
                          'Premiums in LOB: Health',
                          'Premiums in LOB:  Life',
                          'Premiums in LOB: Work Compensations',
                          'Age',
                          'First Policy´s Age',
                          'Premium: Sum']
histogram(insurance_df, features)
In [19]:
outliers = removeOutliers(insurance_df, 1.5, 3, features)
Gross Monthly Salary
Percentiles: 25th=1706.000, 75th=3290.250, IQR=1584.250
Identified outliers: 2
Customer Monetary Value
Percentiles: 25th=-9.440, 75th=399.778, IQR=409.218
Identified outliers: 25
Claims Rate
Percentiles: 25th=0.390, 75th=0.980, IQR=0.590
Identified outliers: 15
Premiums in LOB: Motor
Percentiles: 25th=190.590, 75th=408.300, IQR=217.710
Identified outliers: 6
Premiums in LOB: Household
Percentiles: 25th=49.450, 75th=290.050, IQR=240.600
Identified outliers: 123
Premiums in LOB: Health
Percentiles: 25th=111.800, 75th=219.820, IQR=108.020
Identified outliers: 4
Premiums in LOB:  Life
Percentiles: 25th=9.890, 75th=57.790, IQR=47.900
Identified outliers: 141
Premiums in LOB: Work Compensations
Percentiles: 25th=10.670, 75th=56.790, IQR=46.120
Identified outliers: 162
Age
Percentiles: 25th=35.000, 75th=65.000, IQR=30.000
Identified outliers: 0
First Policy´s Age
Percentiles: 25th=26.000, 75th=38.000, IQR=12.000
Identified outliers: 1
Premium: Sum
Percentiles: 25th=626.425, 75th=824.068, IQR=197.643
Identified outliers: 137
In [20]:
print(len(outliers))
print(insurance_df.shape[0])
print(len(outliers)/insurance_df.shape[0])
439
10296
0.04263791763791764
In [21]:
outliers_data = []
for i in outliers:
    outliers_data.append(insurance_df.iloc[i,:])
insurance_df.drop(outliers, inplace=True)
insurance_df.shape
Out[21]:
(9857, 15)
In [22]:
histogram(insurance_df, features)
In [23]:
#Drop rows with more than 3 NaN's
insurance_df.dropna(thresh=(len(insurance_df.columns) - 3), inplace=True, axis=0)
insurance_df.shape
Out[23]:
(9857, 15)

2.2. ‘Age’ column assessment

In [24]:
#Counting the rows with 'First Policy´s Age' > 'Age'
#It shows us a possible problem with the data
insurance_df[insurance_df['First Policy´s Age'] > insurance_df['Age']].shape[0]
Out[24]:
1702
In [25]:
#Proportion of customers below 18 that have children
#Is 'Age' trustworthy?
x = insurance_df[(insurance_df['Age'] <= 18) & (insurance_df['Has Children (Y=1)'] == 1)].shape[0]
y = insurance_df[(insurance_df['Age'] <= 18) & (insurance_df['Has Children (Y=1)'] == 0)].shape[0]
print("{:.2f}".format(x/(x+y)*100),'%')
77.27 %

2.3. Categorical features classification

In [26]:
data_insurance = deepcopy(insurance_df)

#Encoding Educational Degree and returning back the NaN's

data_insurance['Educational Degree'] = data_insurance['Educational Degree'].apply(str)

labelencoder_X = LabelEncoder()

data_insurance.loc[:,'Educational Degree'] = labelencoder_X.fit_transform(data_insurance.loc[:,'Educational Degree'])

data_insurance['Educational Degree'] = data_insurance['Educational Degree'].apply(lambda x : np.nan if x == 4 else x )
In [27]:
data_insurance.tail()
Out[27]:
Customer Identity Educational Degree Gross Monthly Salary Geographic Living Area Has Children (Y=1) Customer Monetary Value Claims Rate Premiums in LOB: Motor Premiums in LOB: Household Premiums in LOB: Health Premiums in LOB: Life Premiums in LOB: Work Compensations Age First Policy´s Age Premium: Sum
10289 10290 1.0 3498.0 4.0 0.0 245.60 0.67 227.82 270.60 160.92 100.13 69.90 75.0 32.0 829.37
10291 10292 3.0 3188.0 2.0 0.0 -0.11 0.96 393.74 49.45 173.81 9.78 14.78 69.0 34.0 641.56
10293 10294 2.0 2918.0 1.0 1.0 524.10 0.21 403.63 132.80 142.25 12.67 4.89 42.0 24.0 696.24
10294 10295 0.0 1971.0 2.0 1.0 250.05 0.65 188.59 211.15 198.37 63.90 112.91 41.0 37.0 774.92
10295 10296 3.0 2815.0 1.0 1.0 463.75 0.27 414.08 94.45 141.25 6.89 12.89 37.0 28.0 669.56
In [28]:
#Verify the optimal n_neighbors to our KNN classifiers
scaled_data_insurance = rescale(data_insurance)
scaled_data_insurance.drop('Customer Identity', axis=1)

accuracies_for_column_dict = evaluateClassifier(scaled_data_insurance)
fig, ax = plt.subplots(3, figsize=(15,10))
fig.suptitle('KNN - Accuracy x n_neighbors')
ax[0].plot(list(accuracies_for_column_dict['Educational Degree'].keys()),
                                    list(accuracies_for_column_dict['Educational Degree'].values()),
                                    'bx-') 
ax[0].set_title('Educational Degree')
ax[0].grid(True, which='both', color='r', linestyle='-', linewidth=2)

ax[1].plot(list(accuracies_for_column_dict['Geographic Living Area'].keys()),
                                    list(accuracies_for_column_dict['Geographic Living Area'].values()),
                                    'bx-') 
ax[1].set_title('Geographic Living Area')
ax[0].grid(True, which='both', color='r', linestyle='-', linewidth=2)

ax[2].plot(list(accuracies_for_column_dict['Has Children (Y=1)'].keys()),
                                    list(accuracies_for_column_dict['Has Children (Y=1)'].values()),
                                    'bx-') 
ax[2].set_title('Has Children (Y=1)')
ax[2].grid(True, which='both', color='r', linestyle='-', linewidth=2)

for ax in ax.flat:
    ax.set(xlabel='n_neighbors', ylabel='Accuracy')

plt.show()
In [29]:
#Considering we didn't achieve good estimations for the first two categorical columns, we will drop the rows which contain NaNs
#on these columns
data_insurance.dropna(subset=['Educational Degree', 'Geographic Living Area'], inplace=True)

#Setting the optimal number of neighbors to categorical column regression
n_neighbors_dict = {#'Educational Degree' : 8,
                    #'Geographic Living Area' : 11,
                    'Has Children (Y=1)' : 21}
In [30]:
#Fill categorical data with the KNN predicted Values
data_insurance = classifyCategoricalData(data_insurance, n_neighbors_dict)
In [31]:
#Check choices and errors
choices, errors = checking_choices(data_insurance)
errors
Out[31]:
{'Test: 0': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-1.0505311845526077,
   -1.6148069491026735,
   -2.0810799951996346,
   -2.5465004946250773,
   -0.9104060652674717,
   -2.62265416478973,
   -1.1992250168260385,
   -0.3724819030349744,
   -0.39877397310963847,
   -1.4491179931361136,
   -0.29047954758576555,
   -0.24865812147866317,
   -3.6756269017661003,
   -0.9310337848444874,
   -0.03567324898354585]},
 'Test: 1': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.6684846111569442,
   -0.07648128538629728,
   -1.1609885586096338,
   -1.193040387747628,
   -1.15554347875073,
   -0.7377618897177673,
   -0.03434380759960853,
   -0.40738845999920925,
   -0.22733763995986234,
   -0.2589034836583963,
   -1.3507276337769545,
   -2.6028492304644866,
   -0.35873257393900526,
   -0.8600132594878689,
   -1.4745883467333254]},
 'Test: 2': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.0937749170719171,
   -0.1800388114596132,
   -0.8194967502565313,
   -0.6598194581618719,
   -2.231812680894496,
   -0.04574258323607383,
   -0.022973687792542163,
   -0.1239103496568994,
   -3.924865089950055,
   -0.35234399819037865,
   -0.6050631403069195,
   -0.029851114102854348,
   -0.47921398605175813,
   -2.1024449252141917,
   -0.12954254161536372]},
 'Test: 3': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-1.078635619070592,
   -0.9649910290648538,
   -1.2761268268973813,
   -0.7073369328658792,
   -0.3005223803962449,
   -0.6981553664932805,
   -0.596129515032032,
   -0.2977757662329441,
   -0.6542363158032667,
   -1.5496489241205214,
   -0.7910902390024046,
   -0.024128612571503227,
   -0.4676063461519857,
   -1.0302324106765943,
   -0.15996781209902688]},
 'Test: 4': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-2.6379465357837617,
   -0.04526342523377247,
   -1.118547817021097,
   -0.22487796467611032,
   -0.8518239262859582,
   -0.41406179785427266,
   -1.4455282230126967,
   -1.2159662580854484,
   -1.0388140996759416,
   -0.06619173482688412,
   -0.6422832908006513,
   -1.18780346030139,
   -0.66739525778494,
   -0.20301030660469976,
   -2.9798101319214787]},
 'Test: 5': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.9256584404317505,
   -1.3786586501850233,
   -0.04455646629177523,
   -1.4001864943312219,
   -0.99712978957579,
   -0.20312940372120747,
   -0.5574502079314803,
   -0.5270872037962144,
   -0.510503898233059,
   -0.03171388322439328,
   -3.0726175355722685,
   -1.1832094621063196,
   -0.02375790778246234,
   -1.3609574433012295,
   -1.503806509459371]},
 'Test: 6': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.4898331675255432,
   -0.20484902011996442,
   -0.041218313042736376,
   -1.3066538915314143,
   -0.16064319152106088,
   -0.8789862138492806,
   -1.4677501399439818,
   -0.27664489863611874,
   -0.22819373144577892,
   -0.6871152131421756,
   -1.2951726025300436,
   -0.07922056735949079,
   -0.5763816238019397,
   -1.5388986002399005,
   -0.7783746732603383]},
 'Test: 7': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.11222434502778911,
   -0.05996549373084159,
   -1.1937083987518844,
   -0.23784907349199358,
   -1.4844508307916242,
   -3.1233283357202524,
   -0.8403790390306296,
   -1.3734332936507858,
   -0.3151188256030837,
   -1.92707897065572,
   -0.7678771730650347,
   -0.32931742227188576,
   -0.2304387286963483,
   -0.07746581719892665,
   -0.17855728937904436]},
 'Test: 8': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.5939724108207485,
   -0.5919646997463552,
   -2.9324273356432484,
   -0.6436154447758824,
   -0.6039065943635058,
   -0.10914502587415997,
   -1.0483350804913485,
   -2.2089263063033906,
   -0.3340390906580366,
   -1.9636766073046275,
   -0.016907966547832,
   -0.5961973441403234,
   -1.5343003694007304,
   -0.2001879625333447,
   -0.02935093217513529]},
 'Test: 9': {'KN-Errors': [-0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223,
   -0.12607248356027223],
  'Linear-Errors': [-0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575,
   -0.002367186416642575],
  'SVR-Errors': [-0.042474214420406886,
   -1.5920752036013175,
   -0.6900376547869498,
   -0.7162914768581423,
   -0.9747815559937774,
   -2.733623225763514,
   -0.9086474242884,
   -3.6190455867516187,
   -0.6029152351971567,
   -0.4397159176494774,
   -0.4593928644968921,
   -2.2293681806257726,
   -0.04471315766081552,
   -1.91599987243651,
   -3.8133455005777463]}}

2.4. Numerical features regression

In [32]:
'''Fill numerical data with the best regressor algorithm - We decided not to use apply_regressors function anymore.
after checking the errors of choices we decided to use a genetic algorithm for getting better parameters using
decision tree regressor''' 
#data_insurance = apply_regressors(choices,data_insurance, env_params['NumericalColumns'])
#Full dataset
Out[32]:
'Fill numerical data with the best regressor algorithm - We decided not to use apply_regressors function anymore.\nafter checking the errors of choices we decided to use a genetic algorithm for getting better parameters using\ndecision tree regressor'

Genetic Algorithms Results

Summarize:

  • Gross Monthly Salary (Index 10)
      Low R² - Discarded (Drop NaN's)
  • Premiums in LOB: Motor (Index 13)
      The best solution was Test 5 - [‘friedman_mse’,  min_sample_split=122, min_samples_leaf=1, max_features=3, max_depth=28] 
      Fitness: 0.8157 – R² ~ 81.57% 
  • Premiums in LOB: Health (Index 15)
      Low R² - Discarded (Drop NaN's)
  • Premiums in LOB: Life (Index 16)
      The best solution was Test 4 - [‘friedman_mse’, 2, min_samples_leaf=3,  max_features=9, max_depth=9] 
      Fitness: 0.6286 – R² ~ 62.86% 
  • Premiums in LOB: Work Compensations (Index 17)
      Low R² - Discarded (Drop NaN's)
  • First Policy´s Age (Index 19)
      Low R² - Discarded (Drop NaN's)
In [33]:
'''This part of the code fill the NaN's on Columns Motor and Life Premiums with the best
result from Genetic Algorithm configuration'''

column_for_regress = {'Premiums in LOB: Motor':['friedman_mse', 122, 1,3, 28],
                      'Premiums in LOB:  Life': ['friedman_mse', 2, 3, 9, 9]}



for i in column_for_regress: 
    complete,incomplete = split(data_insurance)
    #Split in train-test data    
    X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != i].values,
                                                        complete.loc[:,i].values, test_size = 0.35, random_state = 1)
    
    
    regressor = DecisionTreeRegressor(criterion=column_for_regress[i][0],
                            min_samples_split =column_for_regress[i][1],
                            min_samples_leaf=column_for_regress[i][2],
                            max_features=column_for_regress[i][3],
                            max_depth=column_for_regress[i][4],
                            random_state=1,
                            )

    
    #Train the model
    trained_model = regressor.fit(X_train, 
                             y_train)
    
    #Make predictions
    incomplete_2 = deepcopy(incomplete)
    incomplete_2.loc[:, incomplete.columns != i] = incomplete_2.loc[:, 
                            incomplete.columns != i].apply(lambda row: row.fillna(row.mean()), axis=1)
    
    prediction = trained_model.predict(incomplete_2.loc[:,incomplete_2.columns != i])
    temp_df = pd.DataFrame(prediction.reshape(-1,1), columns = [i])
    
    col_ind = 0
    if i == 'Premiums in LOB: Motor':
        col_ind = -7
    elif i == 'Premiums in LOB:  Life':
        col_ind = -4
        
    #fill NaN's on data_arrivals_incomplete 
    for index in range(len(temp_df)):
        if np.isnan(incomplete.iloc[index,col_ind]):
            incomplete.iloc[index,col_ind] = temp_df[i][index]    
    
    y_pred = regressor.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    
    print(r2)
    
    #and filling the nan's on arrivals_df
    data_insurance = pd.concat([complete, incomplete])

    data_insurance.set_index(data_insurance['Customer Identity'] - 1, inplace=True)
0.8681863204849545
0.6008966695981938

2.5. Drop the rest of NaNs

The decision here is to drop all the NaNs left, since we cannot use a good prediction on the columns. Doing that we are losing 144 rows (1.44% of the dataset, what we considered as an OK choice)

In [34]:
data_insurance.dropna(inplace=True)
In [35]:
data_insurance.isna().sum()
Out[35]:
Customer Identity                      0
Educational Degree                     0
Gross Monthly Salary                   0
Geographic Living Area                 0
Has Children (Y=1)                     0
Customer Monetary Value                0
Claims Rate                            0
Premiums in LOB: Motor                 0
Premiums in LOB: Household             0
Premiums in LOB: Health                0
Premiums in LOB:  Life                 0
Premiums in LOB: Work Compensations    0
Age                                    0
First Policy´s Age                     0
Premium: Sum                           0
dtype: int64
In [36]:
data_insurance
Out[36]:
Customer Identity Educational Degree Gross Monthly Salary Geographic Living Area Has Children (Y=1) Customer Monetary Value Claims Rate Premiums in LOB: Motor Premiums in LOB: Household Premiums in LOB: Health Premiums in LOB: Life Premiums in LOB: Work Compensations Age First Policy´s Age Premium: Sum
Customer Identity
0 1 1.0 2177.0 1.0 1.0 380.97 0.39 375.85 79.45 146.36 47.01 16.890000 36.0 33.0 665.56
1 2 1.0 677.0 4.0 1.0 -131.13 1.12 77.46 416.20 116.69 194.48 106.130000 23.0 37.0 910.96
2 3 0.0 2277.0 3.0 0.0 504.67 0.28 206.15 224.50 124.58 86.35 99.020000 48.0 27.0 740.60
3 4 2.0 1099.0 4.0 1.0 -16.99 0.99 182.48 43.35 311.17 35.34 28.340000 37.0 28.0 600.68
4 5 2.0 1763.0 4.0 1.0 35.23 0.90 338.62 47.80 182.59 18.78 41.450000 45.0 32.0 629.24
5 6 1.0 2566.0 4.0 1.0 -24.33 1.00 440.75 18.90 114.80 7.00 7.670000 62.0 32.0 589.12
6 7 1.0 4103.0 4.0 0.0 -66.01 1.05 156.92 295.60 317.95 14.67 26.340000 75.0 39.0 811.48
7 8 1.0 1743.0 4.0 1.0 -144.91 1.13 248.27 397.30 144.36 66.68 53.230000 44.0 30.0 909.84
8 9 2.0 1862.0 1.0 1.0 356.53 0.36 344.51 18.35 210.04 8.78 9.890000 40.0 37.0 591.57
9 10 2.0 3842.0 1.0 0.0 -119.35 1.12 209.26 182.25 271.94 39.23 55.120000 70.0 42.0 757.80
10 11 2.0 3995.0 4.0 0.0 290.17 0.53 296.50 116.70 227.71 18.67 10.890000 73.0 28.0 670.47
11 12 2.0 2834.0 3.0 1.0 405.74 0.46 249.49 216.70 252.38 36.34 35.450000 60.0 33.0 790.36
12 13 2.0 3757.0 1.0 0.0 2.34 0.97 206.04 340.05 257.05 43.12 15.780000 69.0 37.0 862.04
14 15 1.0 1981.0 1.0 1.0 -89.00 1.12 504.43 -70.00 72.79 10.89 16.890000 43.0 35.0 535.00
15 16 2.0 974.0 4.0 1.0 290.06 0.53 318.06 93.90 247.49 -3.00 19.670000 39.0 30.0 676.12
16 17 1.0 2437.0 1.0 1.0 100.58 0.89 119.80 647.35 168.92 125.58 37.230000 29.0 27.0 1098.88
17 18 2.0 2728.0 4.0 1.0 78.91 0.85 446.64 107.80 86.35 3.89 27.560000 48.0 37.0 672.24
18 19 0.0 1414.0 3.0 1.0 230.38 0.71 313.17 373.40 121.80 14.78 72.570000 36.0 37.0 895.72
19 20 3.0 2751.0 1.0 1.0 352.96 0.36 402.41 9.45 125.69 24.45 30.560000 56.0 30.0 592.56
20 21 3.0 1704.0 4.0 1.0 373.73 0.39 329.84 65.00 221.93 3.89 33.340000 32.0 32.0 654.00
22 23 1.0 1495.0 1.0 1.0 -89.79 1.08 209.04 308.95 192.48 64.79 54.900000 32.0 42.0 830.16
23 24 1.0 2870.0 4.0 1.0 290.05 0.64 222.04 329.50 248.38 48.01 33.560000 57.0 20.0 881.49
24 25 2.0 3201.0 4.0 0.0 108.13 0.81 420.30 128.35 135.58 15.67 4.780000 66.0 30.0 704.68
25 26 3.0 3776.0 2.0 0.0 376.30 0.41 383.74 103.90 154.14 17.56 15.780000 74.0 44.0 675.12
26 27 3.0 4281.0 3.0 0.0 430.86 0.38 330.84 162.80 228.82 -5.00 22.780000 81.0 40.0 740.24
27 28 3.0 3619.0 1.0 0.0 -7.22 0.97 449.75 58.90 112.91 17.78 9.780000 76.0 39.0 649.12
28 29 2.0 3639.0 4.0 0.0 264.17 0.50 301.28 18.90 258.05 -4.00 -0.110000 66.0 23.0 574.12
29 30 2.0 2660.0 3.0 0.0 2.00 0.96 524.21 25.00 54.90 7.00 -5.000000 46.0 38.0 606.11
30 31 1.0 2148.0 4.0 1.0 427.76 0.19 483.09 -15.00 92.24 3.89 -5.110000 43.0 30.0 559.11
31 32 1.0 3490.0 4.0 0.0 240.16 0.69 320.95 335.05 146.25 46.12 9.670000 65.0 34.0 858.04
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5525 5526 3.0 3036.0 4.0 1.0 -18.00 0.99 525.10 24.45 57.01 7.00 3.741696 51.0 23.0 613.56
5733 5734 2.0 1833.0 1.0 1.0 429.09 0.26 425.19 28.90 160.14 0.89 9.434369 44.0 23.0 615.12
5950 5951 2.0 2255.0 4.0 0.0 251.38 0.65 326.95 265.60 196.26 10.78 27.476250 52.0 28.0 799.59
6215 6216 1.0 1110.0 4.0 1.0 716.91 0.24 250.16 491.75 216.93 21.56 15.330952 33.0 24.0 980.40
6305 6306 2.0 2996.0 4.0 1.0 588.66 0.08 514.32 89.45 54.90 9.89 5.580931 55.0 26.0 668.56
6415 6416 2.0 2975.0 3.0 1.0 541.66 0.11 568.55 40.00 28.56 1.89 0.424211 52.0 23.0 639.00
6728 6729 2.0 2701.0 1.0 0.0 483.43 0.13 490.65 0.00 73.57 17.78 5.580931 58.0 38.0 582.00
6889 6890 1.0 2249.0 4.0 1.0 -46.56 1.03 316.95 138.35 208.04 21.56 22.899798 41.0 32.0 684.90
7176 7177 2.0 3476.0 4.0 1.0 296.38 0.64 215.93 369.50 258.05 48.12 16.390345 77.0 39.0 891.60
7443 7444 3.0 1909.0 4.0 1.0 -137.00 1.21 520.10 -55.00 58.01 -1.00 10.640000 50.0 23.0 522.11
7462 7463 2.0 3101.0 1.0 0.0 79.24 0.83 338.84 10.00 154.03 94.24 35.247700 58.0 26.0 597.11
7521 7522 2.0 3596.0 4.0 0.0 363.75 0.42 225.82 146.15 284.72 16.78 26.762500 67.0 30.0 673.47
7625 7626 2.0 3164.0 3.0 0.0 358.74 0.46 404.41 143.35 146.25 20.67 17.898108 64.0 25.0 714.68
7979 7980 1.0 3804.0 3.0 0.0 230.27 0.67 255.27 212.25 260.16 44.12 28.807787 77.0 24.0 771.80
8186 8187 2.0 2398.0 1.0 1.0 -75.00 1.09 499.54 -50.00 75.57 5.00 5.580931 54.0 34.0 530.11
8423 8424 1.0 1565.0 2.0 1.0 44.11 0.90 437.86 75.00 146.36 5.89 9.434369 36.0 43.0 665.11
8519 8520 2.0 3796.0 4.0 0.0 350.96 0.36 346.51 29.45 181.48 27.56 24.350571 72.0 36.0 585.00
8558 8559 3.0 2557.0 1.0 0.0 470.42 0.27 364.07 128.35 181.48 3.00 14.913415 67.0 44.0 676.90
8630 8631 2.0 2530.0 3.0 1.0 38.56 0.90 462.20 64.45 107.02 0.89 5.955000 45.0 44.0 634.56
8674 8675 1.0 2318.0 1.0 1.0 -29.78 1.01 440.64 39.45 106.91 4.78 7.877228 61.0 29.0 591.78
8843 8844 2.0 2386.0 4.0 1.0 -14.22 0.98 514.43 9.45 57.90 10.78 5.580931 51.0 28.0 592.56
9111 9112 2.0 2449.0 2.0 1.0 40.67 0.90 498.76 79.45 77.68 13.78 5.580931 56.0 24.0 669.67
9119 9120 2.0 1763.0 3.0 1.0 548.77 0.23 379.74 216.70 127.58 22.67 11.819000 41.0 40.0 746.69
9127 9128 2.0 1622.0 4.0 1.0 -40.00 1.03 514.32 -5.00 61.79 -5.00 5.580931 42.0 44.0 566.11
9463 9464 1.0 2654.0 3.0 1.0 410.97 0.25 425.08 24.45 119.69 10.89 12.156954 45.0 30.0 580.11
9546 9547 2.0 4199.0 4.0 0.0 -5.33 0.98 200.37 317.85 308.39 19.67 33.521803 70.0 41.0 846.28
9881 9882 1.0 2882.0 4.0 0.0 -70.55 1.09 514.43 -45.55 58.79 2.00 5.580931 55.0 30.0 529.67
10001 10002 2.0 2234.0 2.0 1.0 490.20 0.46 290.72 491.75 140.25 23.45 13.878889 49.0 35.0 946.17
10284 10285 2.0 1504.0 4.0 1.0 -1.55 0.96 390.63 29.45 179.70 -6.00 17.898108 31.0 38.0 593.78
10285 10286 2.0 3878.0 4.0 1.0 -57.45 1.04 269.05 217.25 219.93 32.45 22.649355 70.0 33.0 738.68

9646 rows × 15 columns

3. Data preprocessing

In [37]:
#Preparing to plot correlation
plot_data_insurance = deepcopy(data_insurance)
#plot_data_insurance.drop(env_params['Outliers'], inplace=True)
plot_data_insurance.drop('Customer Identity', axis=1, inplace=True)

3.1. Features correlation

In [38]:
#Plotting the correlation between all variables
plotCorrelation(plot_data_insurance)
In [39]:
#Since 'Age' and 'Gross Monthly Salary' have a high correlation and 'Age' appears not to be trustworthy (described above),
#we'll drop the 'Age' column

plot_data_insurance.drop('Age', axis=1, inplace=True)
data_insurance.drop('Age', axis=1, inplace=True)

#Since 'Customer Monetary Value' and 'Claims Rate' have a high (inverse) correlation, we'll maintain only the CMV

plot_data_insurance.drop('Claims Rate', axis=1, inplace=True)
data_insurance.drop('Claims Rate', axis=1, inplace=True)
In [40]:
#Separating variables
InsCon, ValEng = separateVariables(plot_data_insurance)
In [41]:
#Plotting the correlation between Insurance Consumption variables
plotCorrelation(InsCon)
In [42]:
#print(InsCon.mean(), '\n\n', InsCon.sum())

pd.options.display.float_format = '$ {:,.2f}'.format
ins_sum = pd.DataFrame(InsCon.sum(), columns=['Sum'])
ins_mean = pd.DataFrame(InsCon.mean(), columns=['Mean'])
ins_median = pd.DataFrame(InsCon.median(), columns=['Median'])
pd.concat([ins_sum, ins_mean, ins_median], axis=1, sort=False)
Out[42]:
Sum Mean Median
Premiums in LOB: Motor $ 2,945,215.76 $ 305.33 $ 307.28
Premiums in LOB: Household $ 1,844,924.60 $ 191.26 $ 128.90
Premiums in LOB: Health $ 1,641,113.59 $ 170.13 $ 165.03
Premiums in LOB: Life $ 366,624.99 $ 38.01 $ 24.56
Premiums in LOB: Work Compensations $ 357,539.00 $ 37.07 $ 24.67
Premium: Sum $ 7,154,165.69 $ 741.67 $ 694.57
In [43]:
pd.options.display.float_format = '{:,.2f}'.format
In [44]:
#Plotting the correlation between Customer Value & Engagement variables
plotCorrelation(ValEng)

4. Analysis

4.1. Quartiles (a priori grouping)

In [45]:
def quartilePlotter(df, y_variables, x_variable):

    widths = [6, 6, 6]
    heights = [6, 6]    
    gs_kw = dict(width_ratios=widths, height_ratios=heights)

    f,((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(2,3, figsize=(18,12), gridspec_kw=gs_kw)
    #ax1.get_shared_y_axes().join(ax2,ax3)
    #ax4.get_shared_y_axes().join(ax5)
    
    g1 = sns.heatmap(quartileMatrix(df, y_variables[0], x_variable), annot=True, fmt="d", cmap="Greens",cbar=False,ax=ax1)
    g1.set_ylabel(y_variables[0])
    g1.set_xlabel(x_variable)
    g1.set_ylim(4,0)
    g2 = sns.heatmap(quartileMatrix(df, y_variables[1], x_variable), annot=True, fmt="d", cmap="Greens",cbar=False,ax=ax2)
    g2.set_ylabel(y_variables[1])
    g2.set_xlabel(x_variable)
    g2.set_yticks([])
    g3 = sns.heatmap(quartileMatrix(df, y_variables[2], x_variable),cmap="Greens", annot=True, fmt="d", cbar=False, ax=ax3)
    g3.set_ylabel(y_variables[2])
    g3.set_xlabel(x_variable)
    g3.set_yticks([])
    g4 = sns.heatmap(quartileMatrix(df, y_variables[3], x_variable), annot=True, fmt="d", cmap="Greens",cbar=False,ax=ax4)
    g4.set_ylabel(y_variables[3])
    g4.set_xlabel(x_variable)
    g4.set_yticks([])
    g5 = sns.heatmap(quartileMatrix(df, y_variables[4], x_variable),cmap="Greens", annot=True, fmt="d", cbar=False, ax=ax5)
    g5.set_ylabel(y_variables[4])
    g5.set_xlabel(x_variable)
    g5.yaxis.tick_right()
    g5.set_ylim(4, 0)
    f.delaxes(ax6)
    ax1.invert_yaxis()
    ax2.invert_yaxis()
    ax3.invert_yaxis()
    ax4.invert_yaxis()
    ax5.invert_yaxis()
In [46]:
#Plotting the quartile matrix for Insurance Consumption features
#Show insurance type sales potential in relation to sum of premiums

y_variables = ['Premiums in LOB: Motor',
               'Premiums in LOB: Household',
               'Premiums in LOB: Health',
               'Premiums in LOB:  Life',
               'Premiums in LOB: Work Compensations']
x_variable = 'Premium: Sum'

quartilePlotter(InsCon, y_variables, x_variable)
In [47]:
#Plotting the quartile matrix for Insurance Consumption features versus Salary
#Show insurance type sales potential in relation to salary

y_variables = ['Premiums in LOB: Motor',
               'Premiums in LOB: Household',
               'Premiums in LOB: Health',
               'Premiums in LOB:  Life',
               'Premiums in LOB: Work Compensations']
x_variable = 'Gross Monthly Salary'

quartilePlotter(plot_data_insurance, y_variables, x_variable)

4.2. Boxplots

In [48]:
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(9,6))
ax = sns.boxplot(x='Educational Degree', y='Premiums in LOB:  Life', hue='Has Children (Y=1)', data=plot_data_insurance, palette="Set2")
In [49]:
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(9,6))
ax = sns.boxplot(x='Educational Degree', y='Premiums in LOB: Work Compensations', hue='Has Children (Y=1)', data=plot_data_insurance, palette="Set2")
In [50]:
fig, ax = plt.subplots(figsize=(9,6))
ax = sns.boxplot(x='Educational Degree', y='Premiums in LOB: Health', hue='Has Children (Y=1)', data=plot_data_insurance, palette="Set2")
In [51]:
quartiles_df = quartileMatrix(data_insurance, 'Premium: Sum', 'Customer Monetary Value')

4.3. Decision tree

In [52]:
#Decision tree implementation

#split dataset in features and target variable
feature_cols = ['Educational Degree',
                'Geographic Living Area',
                'Has Children (Y=1)',
                'Gross Monthly Salary',
                'First Policy´s Age']
#feature_cols = ['Gross Monthly Salary', 'First Policy´s Age']
X = data_insurance[feature_cols] # Dependent variables
y = data_insurance['f1_quartile'] # Target variable is Premium: Sum divided into quartiles

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 20% test

# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.35751295336787564
In [53]:
#Plot the decision tree

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_cols,class_names=['q1','q2','q3','q4'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('decision_tree.png')
Image(graph.create_png())
Out[53]:
In [54]:
data_insurance.drop(['Customer Identity'],axis=1,inplace=True)

5. Clustering

5.1. Groupby Table

In [55]:
data_insurance.groupby(['Educational Degree','Geographic Living Area','Has Children (Y=1)'])['Customer Monetary Value'].count().sort_values(ascending=False)

#data_insurance.groupby(['Geographic Living Area','Educational Degree','Has Children (Y=1)'])['Claims Rate'].median().sort_values(ascending=False)

#data_insurance.groupby(['Has Children (Y=1)','Educational Degree','Geographic Living Area'])['Claims Rate'].median().sort_values(ascending=False)
Out[55]:
Educational Degree  Geographic Living Area  Has Children (Y=1)
2.00                4.00                    1.00                  1282
                    1.00                    1.00                  1002
1.00                4.00                    1.00                   972
2.00                3.00                    1.00                   689
1.00                1.00                    1.00                   685
2.00                4.00                    0.00                   566
1.00                3.00                    1.00                   471
                    4.00                    0.00                   413
2.00                1.00                    0.00                   385
                    2.00                    1.00                   326
0.00                4.00                    1.00                   273
2.00                3.00                    0.00                   271
1.00                1.00                    0.00                   267
                    2.00                    1.00                   232
0.00                1.00                    1.00                   220
3.00                4.00                    1.00                   181
1.00                3.00                    0.00                   180
3.00                1.00                    1.00                   149
0.00                3.00                    1.00                   139
2.00                2.00                    0.00                   132
0.00                4.00                    0.00                   127
1.00                2.00                    0.00                   101
3.00                3.00                    1.00                    94
                    4.00                    0.00                    82
0.00                1.00                    0.00                    74
                    2.00                    1.00                    71
3.00                1.00                    0.00                    63
0.00                3.00                    0.00                    60
3.00                2.00                    1.00                    48
                    3.00                    0.00                    41
0.00                2.00                    0.00                    32
3.00                2.00                    0.00                    18
Name: Customer Monetary Value, dtype: int64
In [56]:
data_insurance.drop(['f1_quartile','f2_quartile','Premium: Sum'],axis=1, inplace=True)

5.2. K-Means

In [57]:
#Finding the number of clusters by the elbow graph.
dat_ins=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_ins))
K=list(range(1, 11))
list_distance = []

for aux in range(len(K)):
    centroids,labels,_ = k_means(dat_ins,n_clusters=K[aux],n_init=50,n_jobs=8)
    
    total_distance = 0

    L=[]
    my_labels = list(range(len(centroids)))

    j=0
    while j < len(my_labels):
        i=0
        while i < dat_ins.shape[0]:
            if labels[i] == my_labels[j]:
                newDistance=euclidean(centroids[j,:],dat_ins.iloc[i,:])
                L.append(newDistance)
            i = i+1
        j = j+1

    total_distance = sum(L)
    list_distance.append(total_distance)
    print(aux)
fig, ax = plt.subplots(figsize=(15,5))
plt.plot(K, list_distance, 'bx-') 
plt.xlabel('Values of K') 
plt.ylabel('Distance') 
plt.title('Elbow Graph') 
plt.savefig("images/elbowplot.png")
plt.show()
0
1
2
3
4
5
6
7
8
9
In [58]:
#Comparing with the number of clusters by the hierarchical clustering.

plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(10,10))
Z = linkage(dat_ins,
       method='ward',
            metric='euclidean',
            optimal_ordering=False)

dendrogram(Z,
           truncate_mode='level',
           p=2,
           orientation = 'top',
           leaf_rotation=45,
           leaf_font_size= 16,
           show_contracted=False,
           show_leaf_counts=True)

plt.title('Truncated Hierarchical Clustering Dendrogram')
plt.xlabel('Cluster Size')
plt.ylabel('Distance')

plt.axhline(y=30,c='k')
plt.axhline(y=23.35,c='k')
plt.axhline(y=19,c='k')
plt.savefig("images/dendro.png")
plt.show()
In [59]:
from sklearn.cluster import AgglomerativeClustering
k = 4

dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
       'Premiums in LOB: Motor', 'Premiums in LOB: Household',
       'Premiums in LOB: Health', 'Premiums in LOB:  Life',
       'Premiums in LOB: Work Compensations', 'First Policy´s Age']


Hclustering = AgglomerativeClustering(n_clusters=k,
                                      affinity='euclidean',
                                      linkage='ward')

#Replace the test with proper data
my_HC = Hclustering.fit(dat_ins)

my_labels = pd.DataFrame(my_HC.labels_)
my_labels.columns =  ['Labels']

da=dat_insu.reset_index()
ca=pd.DataFrame(pd.concat([da, my_labels], axis=1))
# Do the necessary transformations
In [60]:
final_result = ca.groupby(by='Labels').count()
In [61]:
dat_ins
Out[61]:
0 1 2 3 4 5 6 7
0 0.39 0.46 0.64 0.14 0.33 0.26 0.14 0.54
1 0.07 0.14 0.13 0.45 0.27 0.97 0.57 0.71
2 0.41 0.54 0.35 0.28 0.28 0.45 0.54 0.29
3 0.16 0.22 0.31 0.11 0.70 0.20 0.20 0.33
4 0.31 0.25 0.58 0.11 0.42 0.12 0.26 0.50
5 0.48 0.21 0.75 0.09 0.26 0.07 0.10 0.50
6 0.80 0.19 0.27 0.34 0.72 0.10 0.19 0.79
7 0.30 0.14 0.42 0.43 0.33 0.35 0.32 0.42
8 0.33 0.45 0.59 0.09 0.48 0.08 0.11 0.71
9 0.75 0.15 0.36 0.24 0.62 0.22 0.32 0.92
10 0.78 0.40 0.51 0.18 0.52 0.12 0.11 0.33
11 0.53 0.48 0.42 0.27 0.57 0.21 0.23 0.54
12 0.73 0.23 0.35 0.38 0.58 0.24 0.13 0.71
13 0.35 0.17 0.86 0.00 0.17 0.09 0.14 0.62
14 0.14 0.40 0.54 0.16 0.56 0.02 0.15 0.42
15 0.45 0.29 0.20 0.66 0.38 0.64 0.24 0.29
16 0.51 0.27 0.76 0.17 0.20 0.05 0.19 0.71
17 0.23 0.37 0.53 0.41 0.28 0.10 0.41 0.71
18 0.52 0.44 0.69 0.08 0.29 0.15 0.21 0.42
19 0.29 0.46 0.56 0.13 0.50 0.05 0.22 0.50
20 0.25 0.17 0.36 0.35 0.44 0.34 0.32 0.92
21 0.54 0.40 0.38 0.37 0.56 0.26 0.22 0.00
22 0.61 0.29 0.72 0.19 0.31 0.11 0.08 0.42
23 0.73 0.46 0.65 0.16 0.35 0.12 0.13 1.00
24 0.84 0.49 0.56 0.22 0.52 0.01 0.17 0.83
25 0.70 0.22 0.77 0.12 0.26 0.12 0.11 0.79
26 0.71 0.39 0.51 0.09 0.58 0.01 0.06 0.12
27 0.50 0.23 0.90 0.09 0.13 0.07 0.03 0.75
28 0.39 0.49 0.82 0.06 0.21 0.05 0.03 0.42
29 0.67 0.37 0.55 0.38 0.33 0.25 0.10 0.58
... ... ... ... ... ... ... ... ...
9616 0.58 0.21 0.90 0.09 0.13 0.07 0.08 0.12
9617 0.32 0.49 0.73 0.10 0.36 0.04 0.10 0.12
9618 0.41 0.38 0.56 0.31 0.45 0.09 0.19 0.33
9619 0.17 0.67 0.43 0.52 0.49 0.14 0.13 0.17
9620 0.57 0.59 0.88 0.15 0.13 0.08 0.09 0.25
9621 0.56 0.56 0.97 0.11 0.07 0.04 0.06 0.12
9622 0.51 0.52 0.84 0.07 0.17 0.12 0.09 0.75
9623 0.41 0.20 0.54 0.20 0.47 0.14 0.17 0.50
9624 0.67 0.41 0.37 0.41 0.58 0.26 0.14 0.79
9625 0.34 0.14 0.89 0.02 0.14 0.03 0.11 0.12
9626 0.59 0.27 0.58 0.08 0.35 0.49 0.23 0.25
9627 0.70 0.45 0.38 0.20 0.64 0.11 0.19 0.42
9628 0.60 0.45 0.69 0.20 0.33 0.13 0.14 0.21
9629 0.74 0.37 0.43 0.26 0.59 0.25 0.20 0.17
9630 0.44 0.18 0.85 0.02 0.17 0.06 0.09 0.58
9631 0.26 0.25 0.75 0.14 0.33 0.06 0.10 0.96
9632 0.74 0.44 0.59 0.10 0.41 0.17 0.18 0.67
9633 0.47 0.52 0.62 0.19 0.41 0.05 0.13 1.00
9634 0.47 0.25 0.79 0.13 0.25 0.04 0.09 1.00
9635 0.42 0.21 0.75 0.11 0.25 0.06 0.10 0.38
9636 0.44 0.22 0.88 0.08 0.13 0.09 0.09 0.33
9637 0.45 0.25 0.85 0.14 0.18 0.10 0.09 0.17
9638 0.31 0.56 0.65 0.27 0.29 0.14 0.12 0.83
9639 0.27 0.20 0.88 0.06 0.14 0.01 0.09 1.00
9640 0.50 0.48 0.73 0.09 0.27 0.09 0.12 0.42
9641 0.82 0.22 0.34 0.36 0.70 0.13 0.22 0.88
9642 0.54 0.18 0.88 0.03 0.14 0.04 0.09 0.42
9643 0.41 0.53 0.50 0.52 0.32 0.15 0.13 0.62
9644 0.25 0.22 0.67 0.10 0.41 0.00 0.14 0.75
9645 0.76 0.19 0.46 0.27 0.50 0.19 0.17 0.54

9646 rows × 8 columns

In [62]:
dat_ins.columns[1]
Out[62]:
1
In [63]:
ncl=4
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
       'Premiums in LOB: Motor', 'Premiums in LOB: Household',
       'Premiums in LOB: Health', 'Premiums in LOB:  Life',
       'Premiums in LOB: Work Compensations', 'First Policy´s Age']
centroids,labels,_=k_means(dat_ins,n_clusters=ncl,n_init=500,n_jobs=8)
In [64]:
ncl=4
fig = plt.figure(figsize=(10, 20))
cent=a.inverse_transform(centroids)
l=0
fig.suptitle(f'Kmeans with {dat_insu.columns[l]} as X-Axis')
#plt.show()
#plt.ylabel()
#plt.xlabel(data.columns[-1])
for j in range(0,8):
    if j == l:
        continue
    ax = fig.add_subplot(4, 2, j+1)
    
    plt.xlabel(f'{dat_insu.columns[l]}')
    plt.ylabel(f'{dat_insu.columns[j]}')
    for i in range(ncl):
        ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
        ax.plot(cent[:,l],cent[:,j],'sk',markersize=10)

plt.savefig("images/kmeans"+str(l)+".png")
plt.show()

5.3. Mean Shift

In [65]:
from sklearn.cluster import MeanShift, estimate_bandwidth
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
       'Premiums in LOB: Motor', 'Premiums in LOB: Household',
       'Premiums in LOB: Health', 'Premiums in LOB:  Life',
       'Premiums in LOB: Work Compensations', 'First Policy´s Age']
# The following bandwidth can be automatically detected using

my_bandwidth = estimate_bandwidth(dat_ins, quantile=0.06, n_jobs=8)
print(my_bandwidth)

ms = MeanShift(bandwidth=my_bandwidth,
               #bandwidth=0.15,
               bin_seeding=True,
              n_jobs=8)

ms.fit(dat_ins)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print(n_clusters_)
0.42697485591340206
3
In [66]:
ncl=3

cent=a.inverse_transform(cluster_centers)
for i in range(8):
    fig = plt.figure(figsize=(10, 20))
    l=i
    fig.suptitle(f'Mean Shift with {dat_insu.columns[l]} as X-Axis')

    #plt.show()
    #plt.ylabel()
    #plt.xlabel(data.columns[-1])
    for j in range(0,8):
        if j == l:
            continue
        ax = fig.add_subplot(4, 2, j+1)
        plt.xlabel(f'{dat_insu.columns[l]}')
        plt.ylabel(f'{dat_insu.columns[j]}')
        for i in range(ncl):
            ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
            ax.plot(cent[:,l],cent[:,j],'sk',markersize=10)
    plt.savefig("images/mean"+str(l)+".png")
    #plt.show()
In [67]:
my_labels = pd.DataFrame(labels)
my_labels.columns =  ['Labels']

da=dat_insu.reset_index().drop(['Customer Identity'],axis=1)
ca=pd.DataFrame(pd.concat([da, my_labels], axis=1))
# Do the necessary transformations
In [68]:
final_result = ca.groupby(by='Labels').mean()
In [69]:
final_result
Out[69]:
Gross Monthly Salary Customer Monetary Value Premiums in LOB: Motor Premiums in LOB: Household Premiums in LOB: Health Premiums in LOB: Life Premiums in LOB: Work Compensations First Policy´s Age
Labels
0 2,609.18 204.18 328.70 153.50 170.24 29.71 29.32 32.00
1 1,990.55 189.22 105.01 434.70 171.54 103.10 123.12 30.93
2 1,941.77 582.91 113.56 697.15 163.63 120.86 52.66 33.99

5.4. Gaussian Mix

In [70]:
from sklearn.mixture import GaussianMixture
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
       'Premiums in LOB: Motor', 'Premiums in LOB: Household',
       'Premiums in LOB: Health', 'Premiums in LOB:  Life',
       'Premiums in LOB: Work Compensations', 'First Policy´s Age']
gmm = GaussianMixture(n_components= 4,
                              init_params='kmeans', # {‘kmeans’, ‘random’}, defaults to ‘kmeans’.
                              max_iter=1000,
                              n_init=20)

gmm.fit(dat_ins)


labels = gmm.predict(dat_ins)
cluster_centers=gmm.means_
print(len(np.unique(labels)))
cent=a.inverse_transform(cluster_centers)
4
In [71]:
ncl=4

for i in range(8):
    fig = plt.figure(figsize=(10, 20))
    fig.suptitle(f'Gaussian Mixture with {dat_insu.columns[l]} as X-Axis')
    l=i


    #plt.show()
    #plt.ylabel()
    #plt.xlabel(data.columns[-1])
    for j in range(0,8):
        if j == l:
            continue
        ax = fig.add_subplot(4, 2, j+1)
        plt.xlabel(f'{dat_insu.columns[l]}')
        plt.ylabel(f'{dat_insu.columns[j]}')
        for i in range(ncl):
            ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
            ax.plot(cent[:,l],cent[:,j],'sk',markersize=10)
    plt.savefig("images/gaus"+str(l)+".png")
    #plt.show()

5.5. DBSCAN

In [72]:
from sklearn.cluster import DBSCAN
dat_insu=data_insurance.drop(['Educational Degree','Geographic Living Area','Has Children (Y=1)'],axis=1)
a=MinMaxScaler()
dat_ins=pd.DataFrame(a.fit_transform(dat_insu))
dat_insu.columns=['Gross Monthly Salary','Customer Monetary Value',
       'Premiums in LOB: Motor', 'Premiums in LOB: Household',
       'Premiums in LOB: Health', 'Premiums in LOB:  Life',
       'Premiums in LOB: Work Compensations', 'First Policy´s Age']
db = DBSCAN(eps= 0.25,
            min_samples=10,n_jobs=8).fit(dat_ins)

labels = db.labels_
print(len(np.unique(labels)))
4
In [73]:
for i in range(8):
    l=i
    ncl=len(np.unique(labels))
    fig = plt.figure(figsize=(10, 20))
    fig.suptitle(f'DBSCAN with {dat_insu.columns[l]} as X-Axis')
    


    #plt.show()
    #plt.ylabel()
    #plt.xlabel(data.columns[-1])
    for j in range(0,8):
        if j == l:
            continue
        ax = fig.add_subplot(4, 2, j+1)
        plt.xlabel(f'{dat_insu.columns[l]}')
        plt.ylabel(f'{dat_insu.columns[j]}')
        for i in range(ncl):
            ax.scatter(dat_insu.iloc[labels==i,l],dat_insu.iloc[labels==i,j])
    plt.savefig("images/dbscan"+str(l)+".png")
    #plt.show()

5.6. Kmodes

In [74]:
data_insurance.columns
Out[74]:
Index(['Educational Degree', 'Gross Monthly Salary', 'Geographic Living Area',
       'Has Children (Y=1)', 'Customer Monetary Value',
       'Premiums in LOB: Motor', 'Premiums in LOB: Household',
       'Premiums in LOB: Health', 'Premiums in LOB:  Life',
       'Premiums in LOB: Work Compensations', 'First Policy´s Age'],
      dtype='object')
In [75]:
from kmodes.kmodes import KModes

dat_ins=data_insurance.drop(['Gross Monthly Salary','Customer Monetary Value','Premiums in LOB: Motor','Premiums in LOB: Household','Premiums in LOB: Health','Premiums in LOB:  Life','Premiums in LOB: Work Compensations','First Policy´s Age'],axis=1)

dat_ins.columns=['Educational Degree', 'Geographic Living Area', 'Has Children (Y=1)']
km = KModes(n_clusters=2, init='random', n_init=128, verbose=1,n_jobs=8)

clusters = km.fit_predict(dat_ins)
Best run was number 14
In [76]:
ncl=len(np.unique(clusters))
fig = plt.figure(figsize=(10, 20))
l=0
fig.suptitle(f'KModes with {dat_ins.columns[l]} as X-Axis')
    



#plt.show()
#plt.ylabel()
#plt.xlabel(data.columns[-1])
for j in range(3):
    if j == l:
        continue
    ax = fig.add_subplot(2, 2, j)
    plt.xlabel(f'{dat_ins.columns[l]}')
    plt.ylabel(f'{dat_ins.columns[j]}')
    for i in range(ncl):
        ax.scatter(dat_ins.iloc[clusters==i,l],dat_ins.iloc[clusters==i,j])
plt.savefig("images/kmode.png")
plt.show()

Classify the outliers

In [78]:
outliers_data = pd.DataFrame(outliers_data)
outliers_data.isna().sum()
Out[78]:
Customer Identity                       0
Educational Degree                      0
Gross Monthly Salary                    0
Geographic Living Area                  0
Has Children (Y=1)                      0
Customer Monetary Value                 0
Claims Rate                             0
Premiums in LOB: Motor                 13
Premiums in LOB: Household              0
Premiums in LOB: Health                13
Premiums in LOB:  Life                 14
Premiums in LOB: Work Compensations    13
Age                                     0
First Policy´s Age                      0
Premium: Sum                            0
dtype: int64
In [79]:
len(outliers_data[outliers_data.isna().any(axis=1)])
Out[79]:
16
In [80]:
outliers_data.columns
Out[80]:
Index(['Customer Identity', 'Educational Degree', 'Gross Monthly Salary',
       'Geographic Living Area', 'Has Children (Y=1)',
       'Customer Monetary Value', 'Claims Rate', 'Premiums in LOB: Motor',
       'Premiums in LOB: Household', 'Premiums in LOB: Health',
       'Premiums in LOB:  Life', 'Premiums in LOB: Work Compensations', 'Age',
       'First Policy´s Age', 'Premium: Sum'],
      dtype='object')
In [81]:
col = ca.columns
outliers_data = outliers_data.loc[:,col]
outliers_data.drop('Labels', axis=1, inplace=True)
outliers_data.dropna(inplace=True, axis=0)
In [82]:
outliers_data.isna().sum()
Out[82]:
Gross Monthly Salary                   0
Customer Monetary Value                0
Premiums in LOB: Motor                 0
Premiums in LOB: Household             0
Premiums in LOB: Health                0
Premiums in LOB:  Life                 0
Premiums in LOB: Work Compensations    0
First Policy´s Age                     0
dtype: int64
In [83]:
clf = KNeighborsClassifier(n_neighbors=7)    
clf.fit(ca.iloc[:,:-1]  , ca.iloc[:,-1:])
Out[83]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')
In [84]:
pred = clf.predict(outliers_data)
outliers_data['Labels'] = pred
In [93]:
outliers_data.head(15)
Out[93]:
Gross Monthly Salary Customer Monetary Value Premiums in LOB: Motor Premiums in LOB: Household Premiums in LOB: Health Premiums in LOB: Life Premiums in LOB: Work Compensations First Policy´s Age Labels
13 1,043.00 -75.12 44.34 342.85 127.69 267.94 94.46 35.00 1
44 1,065.00 -128.68 111.80 -35.00 208.26 224.71 44.23 33.00 0
51 3,234.00 -14,714.08 557.44 20.00 29.56 5.00 -9.00 36.00 1
108 764.00 71.24 79.68 912.95 97.24 213.04 16.56 27.00 2
112 2,354.00 -8,719.04 518.32 4.45 55.90 3.89 10.89 21.00 1
135 2,176.00 -10,198.91 297.61 162.80 143.36 136.47 -3.00 21.00 1
149 984.00 255.71 64.90 197.25 29.56 18.56 451.53 29.00 1
171 1,086.00 -165,680.42 378.07 78.90 166.81 6.89 18.45 28.00 1
179 1,739.00 284.07 99.02 263.95 158.03 45.12 209.04 34.00 1
186 1,247.00 -128.24 33.23 1,026.30 82.57 75.68 212.15 25.00 1
191 1,486.00 872.07 106.02 587.90 76.46 22.45 225.60 22.00 1
257 380.00 1,105.42 50.90 1,012.40 221.93 10.78 128.80 27.00 2
301 1,497.00 1,356.71 41.23 1,089.10 69.68 101.24 150.14 42.00 2
316 1,569.00 14.23 92.35 1,025.75 195.26 54.12 53.12 38.00 0
339 1,288.00 455.43 11.67 245.05 183.70 223.71 104.91 39.00 1
In [ ]: